// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"

// func keccakF1600NEON(a *[200]byte)
//
// keccakF1600NEON applies 24 iterations of the round body below to the
// 200-byte state pointed to by a and writes the result back in place.
//
// Register usage:
//   R0       pointer to the state (advanced while loading, then rewound)
//   R1       cursor into round_consts<>, advanced 8 bytes per round
//   R2       number of rounds still to run (starts at 24)
//   V0-V24   the 25 64-bit state lanes, in memory order (lane i in Vi)
//   V25-V31  scratch: column parities, theta D values, displaced lanes and
//            the current round constant
//
// Lanes are loaded and stored through the low 64 bits (.D1) of each vector
// register only. The round body operates on full 128-bit registers
// (.B16/.D2), so whatever is computed in the upper halves is never stored.
//
// NOTE(review): VEOR3, VRAX1, VXAR and VBCAX belong to the Armv8 SHA3
// extension; presumably the Go caller checks for CPU support before
// dispatching here — confirm against the caller.
// NOTE(review): the 200-byte frame declared below is not referenced anywhere
// in the body (no SP-relative accesses) — confirm whether it is needed.
TEXT ·keccakF1600NEON(SB), $200-8
	MOVD	a+0(FP), R0
	MOVD	$round_consts<>(SB), R1
	MOVD	$24, R2 // counter for loop

	// Load the 25 lanes. Each post-incrementing load reads two lanes and
	// advances R0 by 16; the last load reads lane 24 without advancing.
	VLD1.P	16(R0), [V0.D1, V1.D1]
	VLD1.P	16(R0), [V2.D1, V3.D1]
	VLD1.P	16(R0), [V4.D1, V5.D1]
	VLD1.P	16(R0), [V6.D1, V7.D1]
	VLD1.P	16(R0), [V8.D1, V9.D1]
	VLD1.P	16(R0), [V10.D1, V11.D1]
	VLD1.P	16(R0), [V12.D1, V13.D1]
	VLD1.P	16(R0), [V14.D1, V15.D1]
	VLD1.P	16(R0), [V16.D1, V17.D1]
	VLD1.P	16(R0), [V18.D1, V19.D1]
	VLD1.P	16(R0), [V20.D1, V21.D1]
	VLD1.P	16(R0), [V22.D1, V23.D1]
	VLD1	(R0), [V24.D1]

	// Rewind R0 by the 12*16 bytes consumed above so the final stores start
	// at the beginning of the state again.
	SUB	$192, R0, R0

loop:
	// theta: column parities.
	// V25..V29 = XOR of lanes x, x+5, x+10, x+15, x+20 for x = 0..4.
	// Each VEOR3 XORs three registers; the second group folds in the
	// remaining two lanes of each column.
	VEOR3	 V20.B16, V15.B16, V10.B16, V25.B16
	VEOR3	 V21.B16, V16.B16, V11.B16, V26.B16
	VEOR3	 V22.B16, V17.B16, V12.B16, V27.B16
	VEOR3	 V23.B16, V18.B16, V13.B16, V28.B16
	VEOR3	 V24.B16, V19.B16, V14.B16, V29.B16
	VEOR3	 V25.B16, V5.B16, V0.B16, V25.B16
	VEOR3	 V26.B16, V6.B16, V1.B16, V26.B16
	VEOR3	 V27.B16, V7.B16, V2.B16, V27.B16
	VEOR3	 V28.B16, V8.B16, V3.B16, V28.B16
	VEOR3	 V29.B16, V9.B16, V4.B16, V29.B16

	// theta: per-column D values. VRAX1 Vm, Vn, Vd computes
	// Vd = Vn XOR rotl(Vm, 1). With C[x] the parity of column x:
	//   V30 = C[0] ^ rotl(C[2], 1)   applied to column 1
	//   V31 = C[1] ^ rotl(C[3], 1)   applied to column 2
	//   V27 = C[2] ^ rotl(C[4], 1)   applied to column 3
	//   V28 = C[3] ^ rotl(C[0], 1)   applied to column 4
	//   V29 = C[4] ^ rotl(C[1], 1)   applied to column 0
	// The last three overwrite a parity register, so the order matters:
	// each parity is read by every instruction that needs it before it is
	// replaced.
	VRAX1	V27.D2, V25.D2, V30.D2
	VRAX1	V28.D2, V26.D2, V31.D2
	VRAX1	V29.D2, V27.D2, V27.D2
	VRAX1	V25.D2, V28.D2, V28.D2
	VRAX1	V26.D2, V29.D2, V29.D2

	// theta (apply D), rho and pi, fused.
	// Lane 0 only needs the theta XOR. For every other lane, one
	// VXAR $n, Vd, Vsrc, Vdst computes Vdst = rotr(Vsrc XOR Vd, n), i.e. a
	// left rotation by 64-n, and the choice of Vdst performs the lane
	// permutation. The instructions are ordered so that a register is
	// overwritten only after the lane it held has been consumed; lanes
	// that cannot go straight to their destination are parked in scratch
	// registers, and each D register is overwritten by its final use.
	VEOR	V29.B16, V0.B16, V0.B16

	VXAR	$63, V30.D2, V1.D2, V25.D2 // lane 1 parked in V25

	VXAR	$20, V30.D2, V6.D2, V1.D2
	VXAR	$44, V28.D2, V9.D2, V6.D2
	VXAR	$3, V31.D2, V22.D2, V9.D2
	VXAR	$25, V28.D2, V14.D2, V22.D2
	VXAR	$46, V29.D2, V20.D2, V14.D2

	VXAR	$2, V31.D2, V2.D2, V26.D2 // lane 2 parked in V26

	VXAR	$21, V31.D2, V12.D2, V2.D2
	VXAR	$39, V27.D2, V13.D2, V12.D2
	VXAR	$56, V28.D2, V19.D2, V13.D2
	VXAR	$8, V27.D2, V23.D2, V19.D2
	VXAR	$23, V29.D2, V15.D2, V23.D2

	VXAR	$37, V28.D2, V4.D2, V15.D2

	VXAR	$50, V28.D2, V24.D2, V28.D2 // last use of V28 as a D value
	VXAR	$62, V30.D2, V21.D2, V24.D2
	VXAR	$9, V27.D2, V8.D2, V8.D2
	VXAR	$19, V30.D2, V16.D2, V4.D2
	VXAR	$28, V29.D2, V5.D2, V16.D2

	VXAR	$36, V27.D2, V3.D2, V5.D2

	VXAR	$43, V27.D2, V18.D2, V27.D2 // last use of V27 as a D value
	VXAR	$49, V31.D2, V17.D2, V3.D2
	VXAR	$54, V30.D2, V11.D2, V30.D2 // last use of V30 as a D value
	VXAR	$58, V31.D2, V7.D2, V31.D2 // last use of V31 as a D value
	VXAR	$61, V29.D2, V10.D2, V29.D2 // last use of V29 as a D value

	// chi and iota.
	// VBCAX Va, Vm, Vn, Vd computes Vd = Vn XOR (Vm AND NOT Va). Each group
	// of five handles one row of five lanes, reading the permuted lanes
	// (some still parked in V25-V31) and writing the final lane values
	// back to V0-V24. Within a group, the in-place results are ordered so
	// that every input is read before it is overwritten.

	// Row 4: lanes 20-24.
	VBCAX	V8.B16, V22.B16, V26.B16, V20.B16
	VBCAX	V22.B16, V23.B16, V8.B16, V21.B16
	VBCAX	V23.B16, V24.B16, V22.B16, V22.B16
	VBCAX	V24.B16, V26.B16, V23.B16, V23.B16
	VBCAX	V26.B16, V8.B16, V24.B16, V24.B16

	// V26 is free now: load this round's 64-bit constant into both of its
	// lanes and advance R1 to the next constant.
	VLD1R.P	8(R1), [V26.D2]

	// Row 3: lanes 15-19.
	VBCAX	V3.B16, V19.B16, V30.B16, V17.B16
	VBCAX	V19.B16, V15.B16, V3.B16, V18.B16
	VBCAX	V15.B16, V16.B16, V19.B16, V19.B16
	VBCAX	V16.B16, V30.B16, V15.B16, V15.B16
	VBCAX	V30.B16, V3.B16, V16.B16, V16.B16

	// Row 2: lanes 10-14.
	VBCAX	V31.B16, V12.B16, V25.B16, V10.B16
	VBCAX	V12.B16, V13.B16, V31.B16, V11.B16
	VBCAX	V13.B16, V14.B16, V12.B16, V12.B16
	VBCAX	V14.B16, V25.B16, V13.B16, V13.B16
	VBCAX	V25.B16, V31.B16, V14.B16, V14.B16

	// Row 1: lanes 5-9.
	VBCAX	V4.B16, V9.B16, V29.B16, V7.B16
	VBCAX	V9.B16, V5.B16, V4.B16, V8.B16
	VBCAX	V5.B16, V6.B16, V9.B16, V9.B16
	VBCAX	V6.B16, V29.B16, V5.B16, V5.B16
	VBCAX	V29.B16, V4.B16, V6.B16, V6.B16

	// Row 0: lanes 0-4.
	VBCAX	V28.B16, V0.B16, V27.B16, V3.B16
	VBCAX	V0.B16, V1.B16, V28.B16, V4.B16

	VBCAX	V1.B16, V2.B16, V0.B16, V0.B16  // chi for lane 0; iota is applied to it below

	VBCAX	V2.B16, V27.B16, V1.B16, V1.B16
	VBCAX	V27.B16, V28.B16, V2.B16, V2.B16

	VEOR	V26.B16, V0.B16, V0.B16 // iota: XOR the round constant into lane 0

	// Next round, until the counter reaches zero.
	SUB		$1, R2, R2
	CBNZ	R2, loop

	// Store the 25 lanes back over the input state, mirroring the loads.
	VST1.P	[V0.D1, V1.D1], 16(R0)
	VST1.P	[V2.D1, V3.D1], 16(R0)
	VST1.P	[V4.D1, V5.D1], 16(R0)
	VST1.P	[V6.D1, V7.D1], 16(R0)
	VST1.P	[V8.D1, V9.D1], 16(R0)
	VST1.P	[V10.D1, V11.D1], 16(R0)
	VST1.P	[V12.D1, V13.D1], 16(R0)
	VST1.P	[V14.D1, V15.D1], 16(R0)
	VST1.P	[V16.D1, V17.D1], 16(R0)
	VST1.P	[V18.D1, V19.D1], 16(R0)
	VST1.P	[V20.D1, V21.D1], 16(R0)
	VST1.P	[V22.D1, V23.D1], 16(R0)
	VST1	[V24.D1], (R0)

	RET

// round_consts<> holds the 24 64-bit round constants, one per iteration of
// the loop in keccakF1600NEON, in the order they are applied. The loop reads
// them sequentially with a post-incrementing 8-byte load and XORs each into
// lane 0 (the iota step). The table is file-local, read-only and contains no
// pointers; its size is 24 * 8 = 192 bytes.
DATA	round_consts<>+0x00(SB)/8, $0x0000000000000001
DATA	round_consts<>+0x08(SB)/8, $0x0000000000008082
DATA	round_consts<>+0x10(SB)/8, $0x800000000000808a
DATA	round_consts<>+0x18(SB)/8, $0x8000000080008000
DATA	round_consts<>+0x20(SB)/8, $0x000000000000808b
DATA	round_consts<>+0x28(SB)/8, $0x0000000080000001
DATA	round_consts<>+0x30(SB)/8, $0x8000000080008081
DATA	round_consts<>+0x38(SB)/8, $0x8000000000008009
DATA	round_consts<>+0x40(SB)/8, $0x000000000000008a
DATA	round_consts<>+0x48(SB)/8, $0x0000000000000088
DATA	round_consts<>+0x50(SB)/8, $0x0000000080008009
DATA	round_consts<>+0x58(SB)/8, $0x000000008000000a
DATA	round_consts<>+0x60(SB)/8, $0x000000008000808b
DATA	round_consts<>+0x68(SB)/8, $0x800000000000008b
DATA	round_consts<>+0x70(SB)/8, $0x8000000000008089
DATA	round_consts<>+0x78(SB)/8, $0x8000000000008003
DATA	round_consts<>+0x80(SB)/8, $0x8000000000008002
DATA	round_consts<>+0x88(SB)/8, $0x8000000000000080
DATA	round_consts<>+0x90(SB)/8, $0x000000000000800a
DATA	round_consts<>+0x98(SB)/8, $0x800000008000000a
DATA	round_consts<>+0xA0(SB)/8, $0x8000000080008081
DATA	round_consts<>+0xA8(SB)/8, $0x8000000000008080
DATA	round_consts<>+0xB0(SB)/8, $0x0000000080000001
DATA	round_consts<>+0xB8(SB)/8, $0x8000000080008008
GLOBL	round_consts<>(SB), NOPTR|RODATA, $192
